read data:
getwd()
## [1] "E:/2021\u79cb\u62db/BaiduNetdiskWorkspace/\u54e5\u5927qmss/DV/Group_T_digital-products"
setwd("E:/2021秋招/BaiduNetdiskWorkspace/哥大qmss/DV/Group_T_digital-products/data")
bar <- read.csv("bar.csv")
barreview <- read.csv("barreview.csv")
bus_attr <- read.csv("business_allatt.csv")
user <- read.csv("10%eliteuser.csv")
load library:
#library(tidyverse)
library(stringr)
library(wordcloud)
## Loading required package: RColorBrewer
library(tidytext)
library(DT)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(leaflet)
Map Visualization for Vegas Bars
library(readr)
library(ggplot2)
library(ggthemes)
library(maps)
library(dplyr)
library("DT")
library(stringr)
library("leaflet")
library(RColorBrewer)
library("data.table")
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
register_google(key = "AIzaSyBlZS1dMHmz5qAZww5VQRBBSlJa3VFfyVg", write = TRUE)
## Replacing old key (AIzaSyBlZS1dMHmz5qAZww5VQRBBSlJa3VFfyVg) with new key in C:\Users\lenove\Documents/.Renviron
Las Vegas Map
map_lv <- get_map("Las Vegas",
zoom = 12,
source = "stamen",
maptype = "toner-background")
## Source : https://maps.googleapis.com/maps/api/staticmap?center=Las%20Vegas&zoom=12&size=640x640&scale=2&maptype=terrain&key=xxx
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Las+Vegas&key=xxx
## Source : http://tile.stamen.com/toner-background/12/736/1604.png
## Source : http://tile.stamen.com/toner-background/12/737/1604.png
## Source : http://tile.stamen.com/toner-background/12/738/1604.png
## Source : http://tile.stamen.com/toner-background/12/739/1604.png
## Source : http://tile.stamen.com/toner-background/12/736/1605.png
## Source : http://tile.stamen.com/toner-background/12/737/1605.png
## Source : http://tile.stamen.com/toner-background/12/738/1605.png
## Source : http://tile.stamen.com/toner-background/12/739/1605.png
## Source : http://tile.stamen.com/toner-background/12/736/1606.png
## Source : http://tile.stamen.com/toner-background/12/737/1606.png
## Source : http://tile.stamen.com/toner-background/12/738/1606.png
## Source : http://tile.stamen.com/toner-background/12/739/1606.png
## Source : http://tile.stamen.com/toner-background/12/736/1607.png
## Source : http://tile.stamen.com/toner-background/12/737/1607.png
## Source : http://tile.stamen.com/toner-background/12/738/1607.png
## Source : http://tile.stamen.com/toner-background/12/739/1607.png
ggmap(map_lv)
g_location <- ggmap(map_lv) + theme_map()
g_location + geom_point(data=bar, aes(x=longitude,y=latitude),
size=0.3, alpha=0.3, color="blue")
g_density <- g_location + geom_density2d(aes(x=longitude,y=latitude),
data=bar, color="green", size=1, bins=12) +
stat_density2d(aes(x=longitude,y=latitude,
fill = ..level.., alpha = ..level..),
data=bar, geom = 'polygon', bins=12) +
scale_fill_gradient2(low = "green", mid="yellow", high = "red") +
scale_alpha(range = c(0.00, 0.5))
g_density
# Visualize the neighborhood each bar belongs to
#add legend of stars
lvbar_map_neighborhood <-
leaflet(bar) %>%
addTiles() %>% # Add OpenStreetMap map tiles
addCircles(lng = ~longitude, lat = ~latitude)
pal = colorFactor("Set1", domain = bar$neighborhood) # Grab a palette
color_neighborhood = pal(bar$neighborhood)
lvbar_map_neighborhood %>% addCircles(color=color_neighborhood) %>%
addLegend(pal = pal, values = ~bar$neighborhood, title = "Neighborhood")
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
content <- paste("Name:",bar$name,"<br/>",
"Address:",bar$address,"<br/>",
"Stars:",bar$stars,"<br/>",
"Neighborhood:", bar$neighborhood,"<br/>")
pal = colorFactor("YlOrRd", domain = bar$stars) # Grab a palette
color_stars = pal(bar$stars)
lvbar_map_neighborhood %>% addCircles(color=color_stars, popup = content) %>%
addLegend(pal = pal, values = ~bar$stars, title = "Stars")
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
Yelp bar review- text analysis:
Most Popular Categories regarding bars:
fillColor = "#FFA07A"
fillColor2 = "#F1C40F"
categories = str_split(bar$categories,";")
categories = as.data.frame(unlist(categories))
colnames(categories) = c("Name")
categories %>%
group_by(Name) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
ungroup() %>%
mutate(Name = reorder(Name,Count)) %>%
head(10) %>%
ggplot(aes(x = Name,y = Count)) +
geom_bar(stat='identity',colour="white", fill =fillColor2) +
geom_text(aes(x = Name, y = 1, label = paste0("(",Count,")",sep="")),
hjust=0, vjust=.5, size = 4, colour = 'black',
fontface = 'bold') +
labs(x = 'Name of Category', y = 'Count',
title = 'Top 10 Categories regarding bars') +
coord_flip() +
theme_bw()
Bars with most number of five Star Reviews:
stars_5 <- barreview %>%
filter(stars ==5) %>%
group_by(business_id) %>%
select(business_id,stars,text) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
ungroup()
five = merge(stars_5, bar, by= "business_id")
five2 <- five %>%
filter(stars ==5) %>%
filter(is_open==1)
fivestar <- five2 %>%
arrange(stars) %>%
head(10)
fillColor2 = "#F1C40F"
fivestar %>%
mutate(name = reorder(name,Count)) %>%
ggplot(aes(x = name,y = Count)) +
geom_bar(stat='identity',colour="white", fill = fillColor2) +
geom_text(aes(x = name, y = 1, label = paste0("(",Count,")",sep="")),
hjust=0, vjust=.5, size = 2, colour = 'black',
fontface = 'bold') +
labs(x = 'Name of the Bars',
y = 'Count',
title = 'Name of the bars and Count') +
coord_flip() +
theme_bw()
Most 5 starred bar - J Karaoke Bar:
J_karaoke = bar %>% filter(business_id == "3pSUr_cdrphurO6m1HMP9A") %>%
select(name,neighborhood,city,state,postal_code,categories)
datatable(head(J_karaoke), style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))
A wordcloud to see the common words of reviews on “J Karaoke Bar”
createWordCloud = function(train)
{
train %>%
unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word) %>%
count(word,sort = TRUE) %>%
ungroup() %>%
head(30) %>%
with(wordcloud(word, n, max.words = 30,colors=brewer.pal(8, "Dark2")))
}
createWordCloud(barreview %>%
filter(business_id == "3pSUr_cdrphurO6m1HMP9A"))
From the wordcloud, we can derive the ingisht that people praise the atmosphere, music, cleaning environment, services and food(especially chicken) in this bar, and indicates that they spend happy and comfortable time in this J Karaoke bar.
Similarly, let’s visualize the bars with most number of one star reviews:
#library()
stars_1 <- barreview %>%
filter(stars ==1|stars==1.5) %>%
group_by(business_id) %>%
select(business_id,stars,text) %>%
summarise(Count = n()) %>%
arrange(desc(Count)) %>%
ungroup()
one = merge(stars_1, bar, by= "business_id")
one2 <- one %>%
filter(stars ==1|stars==1.5) %>%
filter(is_open==1)
onestar <- one2 %>%
arrange(stars) %>%
head(10)
fillColor2 = "#F1C40F"
onestar %>%
mutate(name = reorder(name,Count)) %>%
ggplot(aes(x = name,y = Count)) +
geom_bar(stat='identity',colour="white", fill = fillColor2) +
geom_text(aes(x = name, y = 1, label = paste0("(",Count,")",sep="")),
hjust=0, vjust=.5, size = 2, colour = 'black',
fontface = 'bold') +
labs(x = 'Name of the Bars',
y = 'Count',
title = 'Name of the bars and Count') +
coord_flip() +
theme_bw()
Surprisingly, the bar named “Triumph property management” only has one star rating, and there are 12 reviews on that bar.
So we are interested to see the common words of reviews on “Triumph property management”:
createWordCloud = function(train)
{
train %>%
unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word) %>%
count(word,sort = TRUE) %>%
ungroup() %>%
head(30) %>%
with(wordcloud(word, n, max.words = 30,colors=brewer.pal(8, "Dark2")))
}
createWordCloud(barreview %>%
filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ"))
People in their reviews complain about the house/environment of the bar.
Let’s create a datatable to see some information regarding “Triumph property management”:
Triumph = bar %>% filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ") %>%
select(name,neighborhood,city,state,postal_code,categories)
datatable(head(Triumph), style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))
define 5-star and 1-star bar datasets with reviews
goodbar <- barreview %>%
filter(stars == 5) %>%
group_by(business_id) %>%
ungroup()
star_five <- merge(goodbar,bar,by = "business_id")
badbar <- barreview %>%
filter(stars == 1) %>%
group_by(business_id) %>%
ungroup()
star_one <- merge(badbar,bar,by = "business_id")
preprocessing reviews
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
df_five = data.frame(doc_id = star_five$business_id, text = star_five$text,stringsAsFactors = F)
star_five2 <- DataframeSource(df_five)
star_five2 <- VCorpus(star_five2)
df_one = data.frame(doc_id = star_one$business_id, text = star_one$text,stringsAsFactors = F)
star_one2 <- DataframeSource(df_one)
star_one2 <- VCorpus(star_one2)
#Remove unnecessary words(stop words), synatx, punctuation, numbers, white space etc.
library(stringr)
remove_nonalphanum <- function(x){str_replace_all(x, "[^[:alnum:]]", " ")}
remove_brandnames <- function(x){str_replace_all(x, "\\b[A-Z]+\\b", " ")}
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(remove_nonalphanum))
corpus <- tm_map(corpus, content_transformer(remove_brandnames))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(stopwords("en")))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
return(corpus)
}
#cleaning two datasets
star_five_clean <- clean_corpus(star_five2)
star_one_clean <- clean_corpus(star_one2)
#create a document-term-matrix:
library(tm)
#create the dtm from the corpus
corpus_five_dtm <- DocumentTermMatrix(star_five_clean)
corpus_one_dtm <- DocumentTermMatrix(star_one_clean)
#provide a word cloud of the most frequent words for "five_star" bars and "one_star" bars
library(tidytext)
corpus_five_dt <- tidy(corpus_five_dtm)
corpus_one_dt <- tidy(corpus_one_dtm)
head(corpus_five_dt)
## # A tibble: 6 x 3
## document term count
## <chr> <chr> <dbl>
## 1 -1m9o3vGRA8IBPNvNqKLmA adjust 1
## 2 -1m9o3vGRA8IBPNvNqKLmA amazing 1
## 3 -1m9o3vGRA8IBPNvNqKLmA awesome 1
## 4 -1m9o3vGRA8IBPNvNqKLmA beautiful 1
## 5 -1m9o3vGRA8IBPNvNqKLmA best 1
## 6 -1m9o3vGRA8IBPNvNqKLmA dark 1
#tf-idf
corpus_five_tdidf <- corpus_five_dt %>%
bind_tf_idf(term, document, count) %>%
arrange(desc(tf_idf))
corpus_one_tdidf <- corpus_one_dt %>%
bind_tf_idf(term, document, count) %>%
arrange(desc(tf_idf))
head(corpus_five_tdidf)
## # A tibble: 6 x 6
## document term count tf idf tf_idf
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 nSQBL7E9JdN40Dcg-QiFJA tyrone 2 0.182 6.01 1.09
## 2 fxHqXVqED2jYwCSgkKJTjA idiosyncratic 2 0.111 7.11 0.790
## 3 kpC__sWtWkLdSOI2xxdirg smithwicks 1 0.167 4.71 0.785
## 4 C3IwicBceqbeY1-JNuqd0g chillm 2 0.105 7.11 0.748
## 5 QgWPqUuDFm5wF5UpNECZYg phillip 2 0.133 4.91 0.655
## 6 ri4JWuvJQOOkoeWN5ZNC9A wildcats 1 0.0909 7.11 0.646
term_frequency_DT_five <- corpus_five_tdidf
term_frequency_DT_one <- corpus_one_tdidf
library(wordcloud)
#Set seed
set.seed(2103)
#create a wordcloud to show the frequent words of five stars bars
wordcloud(term_frequency_DT_five$term, term_frequency_DT_five$tf,
max.words=50, colors=brewer.pal(8, "Dark2"))
#create a wordcloud to show the frequent words of one stars bars
wordcloud(term_frequency_DT_one$term, term_frequency_DT_one$tf,
max.words=50, colors=brewer.pal(8, "Dark2"))
A pyramid plot to show how the words between five-stars and one-stars bars differ in word frequency:
#combine corpus of the most successful and unsuccessful projects
#select top 20 words
corpus_five_dt$bestworst <- "Top"
corpus_one_dt$bestworst <- "Bottom"
corpus_top_bottom_dt <- rbind(corpus_five_dt,corpus_one_dt)
corpus_top_bottom_count <- corpus_top_bottom_dt %>%
group_by(term) %>%
summarize(total_word = sum(count)) %>%
arrange(desc(total_word)) %>%
head(20)
pyramid = left_join(corpus_top_bottom_dt, corpus_top_bottom_count, by='term')
pyramid <- pyramid %>%
filter(!is.na(total_word)) %>%
group_by(bestworst) %>%
mutate(count_plot = ifelse(bestworst == 'Bottom', count*(-1), count))
ggplot(pyramid, aes(x = reorder(term, total_word),
y = count_plot, fill = bestworst)) +
geom_bar(data = filter(pyramid, bestworst == "Top"), stat = "identity") +
geom_bar(data = filter(pyramid, bestworst == "Bottom"), stat = "identity") +
scale_fill_brewer(palette = "Set1", direction=-1) +
coord_flip() +
scale_y_continuous(breaks = seq(-50,50,25)) +
scale_fill_discrete(name = 'bars star rating', labels=c('one star', 'five star')) +
ylab("") +
ggthemes::theme_tufte() +
labs(
x = 'Top 20 Words',
y= 'Count',
title = 'Pyramid Plot of Top 20 Words, for one star bars and five star bars'
)
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
Sentiment analysis of reviews:
Positive v.s. negative words in the reivews of J Karaoke Bar
library(tidytext)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.0 v purrr 0.3.4
## v tidyr 1.2.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x NLP::annotate() masks ggplot2::annotate()
## x data.table::between() masks dplyr::between()
## x dplyr::filter() masks stats::filter()
## x data.table::first() masks dplyr::first()
## x dplyr::lag() masks stats::lag()
## x data.table::last() masks dplyr::last()
## x purrr::map() masks maps::map()
## x purrr::transpose() masks data.table::transpose()
positiveWordsBarGraph <- function(SC) {
contributions <- SC %>%
unnest_tokens(word, text) %>%
count(word,sort = TRUE) %>%
ungroup() %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
group_by(word) %>%
summarize(occurences = n(),
contribution = sum(value))
contributions %>%
top_n(20, abs(contribution)) %>%
mutate(word = reorder(word, contribution)) %>%
head(20) %>%
ggplot(aes(word, contribution, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
coord_flip() + theme_bw()
}
positiveWordsBarGraph(barreview %>%
filter(business_id == "3pSUr_cdrphurO6m1HMP9A"))
calculate sentiment for “J Karaoke Bar”
J_Karaoke_reviews = star_five %>%
filter(business_id == "3pSUr_cdrphurO6m1HMP9A")
calculate_sentiment <- function(review)
{
sentiment_lines = review %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
group_by(user_id) %>%
summarize(sentiment = mean(value),words = n()) %>%
ungroup() %>%
filter(words >= 10)
return(sentiment_lines)
}
sentiment_lines = calculate_sentiment(J_Karaoke_reviews)
head(sentiment_lines)
## # A tibble: 6 x 3
## user_id sentiment words
## <chr> <dbl> <int>
## 1 2wxtnu-tA8i9HjHD55iU6g 1.91 11
## 2 3iocNPlPThAG2ZaNtUo4TQ 0.733 30
## 3 8dxkcmAXY4ttrVFD1GhbdQ 1.67 21
## 4 aF0BTeTVRXv4OHYXMNH7SQ 2.12 17
## 5 bRzr5YuEIncFzG6_vYSwcw 1.73 11
## 6 buSbz1HfaHoXP3QGw2XV_Q 1.42 12
Display top 10 most positive reviews for 5 star bars:
display_pos_sentiments <- function(sentiment_lines,review_text)
{
pos_sentiment_lines = sentiment_lines %>%
arrange(desc(sentiment)) %>%
top_n(10, sentiment) %>%
inner_join(review_text, by = "user_id") %>%
select(date,sentiment,text)
datatable(pos_sentiment_lines, style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))
}
display_pos_sentiments(sentiment_lines,J_Karaoke_reviews)
Positive v.s. negative words in the reivews of Triumph property management
positiveWordsBarGraph <- function(SC) {
contributions <- SC %>%
unnest_tokens(word, text) %>%
count(word,sort = TRUE) %>%
ungroup() %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
group_by(word) %>%
summarize(occurences = n(),
contribution = sum(value))
contributions %>%
top_n(20, abs(contribution)) %>%
mutate(word = reorder(word, contribution)) %>%
head(20) %>%
ggplot(aes(word, contribution, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
coord_flip() + theme_bw()
}
positiveWordsBarGraph(barreview %>%
filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ"))
Triumph_reviews = barreview %>%
filter(business_id == "Zh6fxrqsKqdSVmTK3roxBQ")
calculate_sentiment <- function(review)
{
sentiment_lines = review %>%
unnest_tokens(word, text) %>%
inner_join(get_sentiments("afinn"), by = "word") %>%
group_by(user_id) %>%
summarize(sentiment = mean(value),words = n()) %>%
ungroup() %>%
filter(words >= 10)
return(sentiment_lines)
}
sentiment_lines = calculate_sentiment(Triumph_reviews)
Display top 10 most negative reviews for Triumph property management:
display_neg_sentiments <- function(sentiment_lines,review_text)
{
neg_sentiment_lines = sentiment_lines %>%
arrange(desc(sentiment)) %>%
top_n(-10, sentiment) %>%
inner_join(review_text, by = "user_id") %>%
select(date,sentiment,text)
datatable(neg_sentiment_lines, style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))
}
display_neg_sentiments(sentiment_lines,Triumph_reviews)
Visualize the geographical location of the top 10 five star bars(blue dot) and bottom 1 star or 1.5 star bars(yellow dot):
library(devtools)
## Loading required package: usethis
devtools::install_github("rstudio/leaflet")
## WARNING: Rtools is required to build R packages, but no version of Rtools compatible with R 4.0.4 was found. (Only the following incompatible version(s) of Rtools were found: 3.5)
##
## Please download and install Rtools 4.0 from https://cran.r-project.org/bin/windows/Rtools/.
## Downloading GitHub repo rstudio/leaflet@HEAD
## utf8 (1.2.1 -> 1.2.2 ) [CRAN]
## cli (3.1.1 -> 3.3.0 ) [CRAN]
## colorspace (2.0-2 -> 2.0-3 ) [CRAN]
## vctrs (0.3.8 -> 0.4.1 ) [CRAN]
## magrittr (2.0.1 -> 2.0.3 ) [CRAN]
## fansi (0.4.2 -> 1.0.3 ) [CRAN]
## RColorBrewer (1.1-2 -> 1.1-3 ) [CRAN]
## tibble (3.1.0 -> 3.1.7 ) [CRAN]
## glue (1.4.2 -> 1.6.2 ) [CRAN]
## digest (0.6.27 -> 0.6.29 ) [CRAN]
## Rcpp (1.0.8 -> 1.0.8.3) [CRAN]
## ggplot2 (3.3.5 -> 3.3.6 ) [CRAN]
## sp (1.4-6 -> 1.4-7 ) [CRAN]
## xfun (0.29 -> 0.30 ) [CRAN]
## yaml (2.2.2 -> 2.3.5 ) [CRAN]
## jsonlite (1.7.3 -> 1.8.0 ) [CRAN]
## Installing 16 packages: utf8, cli, colorspace, vctrs, magrittr, fansi, RColorBrewer, tibble, glue, digest, Rcpp, ggplot2, sp, xfun, yaml, jsonlite
## Installing packages into 'C:/Users/lenove/Documents/R/win-library/4.0'
## (as 'lib' is unspecified)
##
## There is a binary version available but the source version is later:
## binary source needs_compilation
## cli 3.2.0 3.3.0 TRUE
##
## Binaries will be installed
## package 'utf8' successfully unpacked and MD5 sums checked
## package 'cli' successfully unpacked and MD5 sums checked
## package 'colorspace' successfully unpacked and MD5 sums checked
## package 'vctrs' successfully unpacked and MD5 sums checked
## package 'magrittr' successfully unpacked and MD5 sums checked
## package 'fansi' successfully unpacked and MD5 sums checked
## package 'glue' successfully unpacked and MD5 sums checked
## package 'digest' successfully unpacked and MD5 sums checked
## package 'Rcpp' successfully unpacked and MD5 sums checked
## package 'sp' successfully unpacked and MD5 sums checked
## package 'xfun' successfully unpacked and MD5 sums checked
## package 'yaml' successfully unpacked and MD5 sums checked
## package 'jsonlite' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\lenove\AppData\Local\Temp\RtmpW8Y3GO\downloaded_packages
## WARNING: Rtools is required to build R packages, but no version of Rtools compatible with R 4.0.4 was found. (Only the following incompatible version(s) of Rtools were found: 3.5)
##
## Please download and install Rtools 4.0 from https://cran.r-project.org/bin/windows/Rtools/.
## * checking for file 'C:\Users\lenove\AppData\Local\Temp\RtmpW8Y3GO\remotes1ac8117e4863\rstudio-leaflet-0016c07/DESCRIPTION' ... OK
## * preparing 'leaflet':
## * checking DESCRIPTION meta-information ... OK
## * checking for LF line-endings in source and make files and shell scripts
## * checking for empty or unneeded directories
## Removed empty directory 'leaflet/docs'
## Removed empty directory 'leaflet/man-roxygen'
## * building 'leaflet_2.1.1.tar.gz'
##
LasvegasCoords = bar %>% filter(city == "Las Vegas")
center_lon = median(LasvegasCoords$longitude,na.rm = TRUE)
center_lat = median(LasvegasCoords$latitude,na.rm = TRUE)
map <- leaflet(rbind(fivestar,onestar)) %>%
addProviderTiles("Esri.NatGeoWorldMap") %>%
addCircles(lng = ~longitude, lat = ~latitude,radius = ~sqrt(review_count)) %>%
addCircleMarkers(data=fivestar,col="blue",group="fivestar") %>%
addCircleMarkers(data=onestar,color='yellow',group="onestar") %>%
#Layers control
addLayersControl(overlayGroups = c("fivestar","onestar"),
options = layersControlOptions(collapsed = FALSE)
) %>%
# controls
setView(lng=center_lon, lat=center_lat,zoom = 13)
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
## Assuming "longitude" and "latitude" are longitude and latitude, respectively
map
Top five stars bars are more centralized and concentrated; while the worst one star bars are relatively more sparse in their location, and are not close to transportation hub.
Social network analysis & regression models:
There are mainly three parts in this analysis. p1, p8 are network graphs, p2-p5 are descriptive analysis graphs, p6-p7 are correlation graphs.
library(readr)
library(networkD3)
##
## Attaching package: 'networkD3'
## The following object is masked from 'package:leaflet':
##
## JS
## The following object is masked from 'package:DT':
##
## JS
library(visNetwork)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
##
## wind
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(sjPlot)
## Registered S3 method overwritten by 'parameters':
## method from
## format.parameters_distribution datawizard
##Among the elite users who have reviewed any of bars in Vegas, what are their relationship? Is there some inside network between these celebrities in Yelp? Or do they just review and travel separately? To figure it out, I find those elite users who also has at least one elite user friend and construct a network between them. As a result, most of the links are individual and not connected to another links. There exists A-B-C chains, which means A is B’s friend, and B is C’s friend, but no classical social network “circle” or group. On the other word, A is not C’s friend. Thus, as Yelp is not a social media, but a review app, elite users don’t interact with each other in the app. It is more possible that every elite user is the center of his/her fans, not the center of his/her peers.
## Every user has reviewed one of the bars in Vegas
name <- user[, c("user_id", "friends")]
## 1. Network graph of top elite users(whose fans >= 100 and useful value >= 900)
net <- separate_rows(name, friends, sep = ",", convert = TRUE)
## No outside friends now
net <- net %>%
filter(friends %in% user_id)
## Delete users with no inside followers
net <- rename(net, source = user_id)
net <- rename(net, target = friends)
netnode <- user %>%
filter(user_id %in% net$source)
summary(netnode$average_stars)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.350 3.632 3.860 3.838 4.010 4.470
net <- as.data.frame(net)
p1 <- simpleNetwork(net,
nodeColour = "blue",
zoom=T,
fontSize = 16)
## There is no obvious network between the elite users
p1
## 8. Append: Network of bars
rb <- barreview %>%
group_by(business_id) %>%
summarize(count = n()) %>%
arrange(desc(count)) %>%
head(50)
r50 <- barreview %>%
filter(business_id %in% rb$business_id) %>%
select(user_id, business_id)
## One bar have a branch in very close location also listed in top50, so actually the number of bars is 49
r50 <- left_join(r50, bar, by = "business_id")
r50 <- r50[, c("user_id", "name", "neighborhood", "review_count")]
r50$name<-gsub('["]',"", r50$name)
r50$name <- ifelse(r50$name == "Bachi Burger" & r50$neighborhood == "Southeast", "Bachi Burger(SE)", r50$name)
r50 <- mutate(r50, barnum = as.factor(name))
barlink <- NULL
for (i in 1 : 48) {
for (j in i : 48) {
a <- r50 %>% filter(as.numeric(barnum) == i)
a <- a$user_id
b <- r50 %>% filter(as.numeric(barnum) == j)
b <- b$user_id
c <- intersect(a, b)
d <- length(c)
tmp <- c(i, j, d)
barlink <- rbind(barlink, tmp)
}
}
barlink <- as.data.frame(barlink)
barlink <- barlink %>%
filter(V1 != V2) %>%
filter(V3 > 0)
barlink <- rename(barlink, source = V1, target = V2, value = V3)
barlink2 <- barlink %>% filter(value >= 50)
barlink2$value <- barlink2$value / 50
barlink2$source = barlink2$source - 1
barlink2$target = barlink2$target - 1
barnode <- r50[, c("barnum", "neighborhood", "review_count")] %>%
distinct()
barnode$neighborhood <- as.factor(barnode$neighborhood)
barnode$rcsize <- barnode$review_count / 100 - 10
p8 <- forceNetwork(Links = barlink2,
Nodes = barnode,
Source = "source",
Target = "target",
Value = "value",
NodeID = "barnum",
Nodesize = "rcsize",
Group = "neighborhood",
opacity = 0.6, zoom = TRUE)
p8
## Descriptive graph
rc <- barreview %>%
group_by(business_id) %>%
summarize(avguseful = mean(useful), count = n())
bar <- inner_join(bar, rc, by = "business_id")
rmost <- bar %>%
arrange(desc(count)) %>%
head(10)
rmost[8, 4] <- "Downtown"
rmost <- rmost[, c("count", "name", "neighborhood", "categories")]
rmost <- rmost[order(rmost[ ,"count"], decreasing = TRUE), ]
reviewgraph <- ggplot(rmost, aes(x = count, y = reorder(name, count), fill = neighborhood, text = paste("categories:", categories))) +
geom_bar(stat="identity", width=1, color="white") +
labs(x="review count", y="name of bar")
## 2. Which bars are reviewed most?
p2 <- ggplotly(reviewgraph)
p2
neighborhood <- bar %>%
group_by(neighborhood) %>%
summarize(avgstar = mean(stars), count = n())
neighborhood[1, 1] <- "Not known"
neighborhoodgraph <- ggplot(neighborhood, aes(x = count, y = reorder(neighborhood, count), fill = round(avgstar, 2))) +
geom_bar(stat="identity", width=1, color="white") +
labs(x="review count", y="name of neighborhood")
## 3. Which neighborhood has most bars?
p3 <- ggplotly(neighborhoodgraph)
p3
topstar <- mutate(bar, topstar = ifelse(stars >= 4 & review_count >= 100, "Top bars", "Non top bars"))
toppct <- topstar %>%
group_by(topstar) %>%
summarize(count = n(), avgreview = mean(review_count))
## 4. The gap between topbars and non top bars
p4 <- tab_df(toppct)
p4
| topstar | count | avgreview |
|---|---|---|
| Non top bars | 1108 | 108.22 |
| Top bars | 240 | 529.58 |
topstar <- topstar %>%
filter(topstar == "Top bars") %>%
group_by(neighborhood) %>%
summarize(count = n(), avgstar = mean(stars))
topstar[1, 1] <- "Not known"
topbargraph <- ggplot(topstar, aes(x = count, y = reorder(neighborhood, count), fill = round(avgstar, 2))) +
geom_bar(stat="identity", width=1, color="white") +
labs(x="review count", y="name of neighborhood")
## 5. Which neighborhood has most top bars?
p5 <- ggplotly(topbargraph)
p5
## 6. What influences the elite users' review scores most who has reviewed in Las Vegas.
lm1 <- lm(average_stars ~ review_count + fans + useful + compliment_hot, user)
summary(lm1)
##
## Call:
## lm(formula = average_stars ~ review_count + fans + useful + compliment_hot,
## data = user)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.87542 -0.16202 0.01053 0.17069 0.75058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.896e+00 1.803e-02 216.130 < 2e-16 ***
## review_count -8.430e-05 1.598e-05 -5.274 1.77e-07 ***
## fans 1.525e-04 4.488e-05 3.397 0.000719 ***
## useful 1.548e-07 6.045e-07 0.256 0.798000
## compliment_hot -2.032e-06 5.988e-06 -0.339 0.734405
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2631 on 718 degrees of freedom
## Multiple R-squared: 0.04107, Adjusted R-squared: 0.03573
## F-statistic: 7.689 on 4 and 718 DF, p-value: 4.549e-06
userstars <- ggplot(data = user, aes(x = fans, y = average_stars)) +
stat_smooth(method = "lm", col = "blue") +
xlab("Number of fans") + ylab("Average stars")
p6 <- ggplotly(userstars)
## `geom_smooth()` using formula 'y ~ x'
p6
# 7. Does the amount of review have a positive correlation with the bars' reputation in Las Vegas.
lm2 <- lm(stars ~ review_count, bar, na.action = na.omit)
summary(lm2)
##
## Call:
## lm(formula = stars ~ review_count, data = bar, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6092 -0.6094 -0.1104 0.3878 1.3911
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.609e+00 2.044e-02 176.539 < 2e-16 ***
## review_count 1.405e-04 5.089e-05 2.762 0.00583 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6678 on 1346 degrees of freedom
## Multiple R-squared: 0.005634, Adjusted R-squared: 0.004895
## F-statistic: 7.627 on 1 and 1346 DF, p-value: 0.005829
barstars <- ggplot(data = bar, aes(x = review_count, y = stars)) +
stat_smooth(method = "lm", col = "blue") +
xlab("Number of reviews") + ylab("Average stars")
p7 <- ggplotly(barstars)
## `geom_smooth()` using formula 'y ~ x'
p7